Projet VID

Données sur les crédits Allemands

Auteur·rice
Affiliation

Michael Strefeler

Département TIC

Date de publication

Version du: 1 juin, 2024

Introduction

Dans ce projet il nous est demandé de trouver les variables permettant d’obtenir le meilleur modèle de régression linéaire multiple pour déterminer si c’est une bonne idée de faire un crédit bancaire à une personne.

Analyse exploratoire des données

df <- read.csv("data/GermanCredit.csv", header=TRUE, sep=";")

# Conversion des données catégorielles en factor
df$CHK_ACCT <- as.factor(df$CHK_ACCT)
df$HISTORY <- as.factor(df$HISTORY)
df$SAV_ACCT <- as.factor(df$SAV_ACCT)
df$EMPLOYMENT <- as.factor(df$EMPLOYMENT)
df$PRESENT_RESIDENT <- as.factor(df$PRESENT_RESIDENT)
df$JOB <- as.factor(df$JOB)

Vérification des données

summary(df)
      OBS.        CHK_ACCT    DURATION     HISTORY    NEW_CAR     
 Min.   :   1.0   0:274    Min.   :-6.00   0: 40   Min.   :0.000  
 1st Qu.: 250.8   1:269    1st Qu.:12.00   1: 49   1st Qu.:0.000  
 Median : 500.5   2: 63    Median :18.00   2:530   Median :0.000  
 Mean   : 500.5   3:394    Mean   :20.89   3: 88   Mean   :0.234  
 3rd Qu.: 750.2            3rd Qu.:24.00   4:293   3rd Qu.:0.000  
 Max.   :1000.0            Max.   :72.00           Max.   :1.000  
                                                                  
    USED_CAR       FURNITURE        RADIO.TV      EDUCATION      RETRAINING   
 Min.   :0.000   Min.   :0.000   Min.   :0.00   Min.   :0.00   Min.   :0.000  
 1st Qu.:0.000   1st Qu.:0.000   1st Qu.:0.00   1st Qu.:0.00   1st Qu.:0.000  
 Median :0.000   Median :0.000   Median :0.00   Median :0.00   Median :0.000  
 Mean   :0.103   Mean   :0.181   Mean   :0.28   Mean   :0.05   Mean   :0.097  
 3rd Qu.:0.000   3rd Qu.:0.000   3rd Qu.:1.00   3rd Qu.:0.00   3rd Qu.:0.000  
 Max.   :1.000   Max.   :1.000   Max.   :1.00   Max.   :1.00   Max.   :1.000  
                                                                              
     AMOUNT      SAV_ACCT EMPLOYMENT  INSTALL_RATE      MALE_DIV   
 Min.   :  250   0:603    0: 62      Min.   :1.000   Min.   :0.00  
 1st Qu.: 1366   1:103    1:172      1st Qu.:2.000   1st Qu.:0.00  
 Median : 2320   2: 63    2:339      Median :3.000   Median :0.00  
 Mean   : 3271   3: 48    3:174      Mean   :2.973   Mean   :0.05  
 3rd Qu.: 3972   4:183    4:253      3rd Qu.:4.000   3rd Qu.:0.00  
 Max.   :18424                       Max.   :4.000   Max.   :1.00  
                                                                   
  MALE_SINGLE    MALE_MAR_or_WID  CO.APPLICANT     GUARANTOR     
 Min.   :0.000   Min.   :0.000   Min.   :0.000   Min.   :-1.000  
 1st Qu.:0.000   1st Qu.:0.000   1st Qu.:0.000   1st Qu.: 0.000  
 Median :1.000   Median :0.000   Median :0.000   Median : 0.000  
 Mean   :0.549   Mean   :0.092   Mean   :0.041   Mean   : 0.051  
 3rd Qu.:1.000   3rd Qu.:0.000   3rd Qu.:0.000   3rd Qu.: 0.000  
 Max.   :2.000   Max.   :1.000   Max.   :1.000   Max.   : 1.000  
                                                                 
 PRESENT_RESIDENT  REAL_ESTATE    PROP_UNKN_NONE       AGE        
 1:130            Min.   :0.000   Min.   :0.000   Min.   : 19.00  
 2:308            1st Qu.:0.000   1st Qu.:0.000   1st Qu.: 27.00  
 3:149            Median :0.000   Median :0.000   Median : 33.00  
 4:413            Mean   :0.282   Mean   :0.154   Mean   : 35.53  
                  3rd Qu.:1.000   3rd Qu.:0.000   3rd Qu.: 42.00  
                  Max.   :1.000   Max.   :1.000   Max.   :151.00  
                                                  NA's   :14      
 OTHER_INSTALL        RENT          OWN_RES       NUM_CREDITS    JOB    
 Min.   :0.000   Min.   :0.000   Min.   :0.000   Min.   :1.000   0: 22  
 1st Qu.:0.000   1st Qu.:0.000   1st Qu.:0.000   1st Qu.:1.000   1:200  
 Median :0.000   Median :0.000   Median :1.000   Median :1.000   2:630  
 Mean   :0.186   Mean   :0.179   Mean   :0.713   Mean   :1.407   3:148  
 3rd Qu.:0.000   3rd Qu.:0.000   3rd Qu.:1.000   3rd Qu.:2.000          
 Max.   :1.000   Max.   :1.000   Max.   :1.000   Max.   :4.000          
                                                                        
 NUM_DEPENDENTS    TELEPHONE        FOREIGN         RESPONSE  
 Min.   :1.000   Min.   :0.000   Min.   :0.000   Min.   :0.0  
 1st Qu.:1.000   1st Qu.:0.000   1st Qu.:0.000   1st Qu.:0.0  
 Median :1.000   Median :0.000   Median :0.000   Median :1.0  
 Mean   :1.155   Mean   :0.404   Mean   :0.037   Mean   :0.7  
 3rd Qu.:1.000   3rd Qu.:1.000   3rd Qu.:0.000   3rd Qu.:1.0  
 Max.   :2.000   Max.   :1.000   Max.   :1.000   Max.   :1.0  
                                                              
library(ggplot2)
library(gridExtra)

# Liste des colonnes sans "OBS."
columns_to_plot <- setdiff(names(df), c("OBS."))

# Liste des graphiques
plots <- list()

# Boucle pour faire un graphique par colonne
for (col in columns_to_plot) {
  # Création du graphique
  p <- ggplot(df, aes_string(x = col)) +
    geom_bar() +
    ggtitle(paste("Colonne: ", col)) +
    xlab(col) +
    theme_minimal()
  
  # Ajout du graphique à la liste
  plots[[col]] <- p
}

# Affichage des graphiques (2 colonnes)
do.call(grid.arrange, c(plots, ncol = 2))

Voici les données observées que ne jouent pas avec la donnée: valeur min DURATION -6?, MALE_SINGLE une valeur à 2, GUARANTOR -1, PRESENT_RESIDENT valeurs de 1 à 4 alors que la données dit valeurs 0 à 3, valeur max AGE 151. Pour AGE il y a 14 valeurs manquantes.

Voici les corrections validées par le client:

Correction des erreurs dans les données

Premier modèle de régression multiple

df.lm <- lm(formula = RESPONSE ~ . - OBS., data = df)
summary(df.lm)

Call:
lm(formula = RESPONSE ~ . - OBS., data = df)

Residuals:
     Min       1Q   Median       3Q      Max 
-1.06285 -0.30776  0.08616  0.28136  0.90095 

Coefficients:
                    Estimate Std. Error t value Pr(>|t|)    
(Intercept)        7.732e-01  1.771e-01   4.367  1.4e-05 ***
CHK_ACCT1          8.250e-02  3.694e-02   2.233 0.025765 *  
CHK_ACCT2          1.926e-01  5.817e-02   3.311 0.000964 ***
CHK_ACCT3          2.830e-01  3.487e-02   8.117  1.5e-15 ***
DURATION          -4.514e-03  1.517e-03  -2.976 0.002991 ** 
HISTORY1           4.700e-03  9.001e-02   0.052 0.958366    
HISTORY2           1.455e-01  7.084e-02   2.054 0.040292 *  
HISTORY3           1.976e-01  7.799e-02   2.534 0.011445 *  
HISTORY4           2.608e-01  7.164e-02   3.641 0.000287 ***
NEW_CAR           -1.179e-01  6.094e-02  -1.935 0.053339 .  
USED_CAR           9.934e-02  6.995e-02   1.420 0.155902    
FURNITURE          3.491e-03  6.363e-02   0.055 0.956266    
RADIO.TV           1.124e-02  6.069e-02   0.185 0.853160    
EDUCATION         -1.433e-01  8.013e-02  -1.788 0.074069 .  
RETRAINING        -1.494e-02  7.012e-02  -0.213 0.831349    
AMOUNT            -1.556e-05  7.239e-06  -2.150 0.031807 *  
SAV_ACCT1          4.518e-02  4.442e-02   1.017 0.309438    
SAV_ACCT2          8.090e-02  5.471e-02   1.479 0.139578    
SAV_ACCT3          1.569e-01  6.209e-02   2.528 0.011637 *  
SAV_ACCT4          1.205e-01  3.576e-02   3.369 0.000785 ***
EMPLOYMENT1        1.725e-03  7.011e-02   0.025 0.980373    
EMPLOYMENT2        5.776e-02  6.688e-02   0.864 0.387988    
EMPLOYMENT3        1.303e-01  6.999e-02   1.861 0.062989 .  
EMPLOYMENT4        6.912e-02  6.676e-02   1.035 0.300754    
INSTALL_RATE      -4.407e-02  1.326e-02  -3.324 0.000923 ***
MALE_DIV          -6.800e-02  6.319e-02  -1.076 0.282187    
MALE_SINGLE        7.602e-02  3.208e-02   2.370 0.017987 *  
MALE_MAR_or_WID    2.539e-02  4.953e-02   0.513 0.608342    
CO.APPLICANT      -6.803e-02  6.574e-02  -1.035 0.301023    
GUARANTOR          1.770e-01  5.928e-02   2.986 0.002896 ** 
PRESENT_RESIDENT2 -1.237e-01  4.438e-02  -2.787 0.005419 ** 
PRESENT_RESIDENT3 -6.385e-02  4.994e-02  -1.279 0.201341    
PRESENT_RESIDENT4 -5.693e-02  4.497e-02  -1.266 0.205859    
REAL_ESTATE        3.227e-02  3.189e-02   1.012 0.311889    
PROP_UNKN_NONE    -8.947e-02  5.976e-02  -1.497 0.134657    
AGE                1.653e-03  1.298e-03   1.273 0.203350    
OTHER_INSTALL     -8.211e-02  3.489e-02  -2.353 0.018820 *  
RENT              -1.062e-01  7.303e-02  -1.454 0.146390    
OWN_RES           -3.417e-02  7.024e-02  -0.486 0.626785    
NUM_CREDITS       -3.908e-02  2.803e-02  -1.394 0.163515    
JOB1              -8.530e-02  1.033e-01  -0.825 0.409399    
JOB2              -9.774e-02  1.006e-01  -0.971 0.331615    
JOB3              -7.416e-02  1.023e-01  -0.725 0.468507    
NUM_DEPENDENTS    -3.964e-02  3.820e-02  -1.038 0.299742    
TELEPHONE          5.015e-02  2.968e-02   1.690 0.091422 .  
FOREIGN            1.622e-01  7.035e-02   2.306 0.021323 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.3972 on 940 degrees of freedom
  (14 observations deleted due to missingness)
Multiple R-squared:  0.2853,    Adjusted R-squared:  0.2511 
F-statistic:  8.34 on 45 and 940 DF,  p-value: < 2.2e-16
library(ggResidpanel)
resid_interact(df.lm, plots = c("resid", "qq", "cookd", "boxplot"))

Recherche du meilleur modèle

link

```{r}
library(leaps)
Best_Subset <- regsubsets(RESPONSE~., data = df, nbest = 1, nvmax = NULL, force.in = NULL, force.out =  "OBS.", method = "exhaustive")
summary_best_subset <- summary(regsubsets.out)
as.data.frame(summary_best_subset$outmat)
which.max(summary_best_subset$adjr2)
summary_best_subset$which[13,]
```

Source

```{r}
library(leaps)
models <- regsubsets(RESPONSE ~ ., data=df, nvmax=NULL, force.out = "OBS.")
summary(models)
res.sum <- summary(models)
data.frame(
  Adj.R2 = which.max(res.sum$adjr2),
  CP = which.min(res.sum$cp),
  BIC = which.min(res.sum$bic)
)
```
```{r}
get_model_formula <- function(id, object, outcome){
  # get models data
  models <- summary(object)$which[id,-1]
  # Get outcome variable
  form <- as.formula(object$call[[2]])
  outcome <- all.vars(form)[1]
  # Get model predictors
  predictors <- names(which(models == TRUE))
  predictors <- paste(predictors, collapse = "+")
  # Build model formula
  as.formula(paste0(outcome, "~", predictors))
}

#lm10 <- lm(get_model_formula(10, models, "RESPONSE"), data=df)
#summary(lm10)

#lm18 <- lm(get_model_formula(18, models, "RESPONSE"), data=df)
#summary(lm18)

#lm23 <- lm(get_model_formula(23, models, "RESPONSE"), data=df)
#summary(lm23)
```